
###########################################################
##### Haiti elite network project  		          			#####
##### cleaning Haiti BM and Orbis ownership data  		#####
##### 2021 mar 03                   									#####
###########################################################


## read in list of names and types from trade data
## create last name variable
## match list of comp names to corp data
## match list of ind names to genealogy


##########
## ownership data - Haiti Building Markets
##########

## Load the Haiti BM scrapes and assemble a single `own` table.
## NOTE(review): smartbind() is not defined in this file; presumably gtools
## is attached by whatever runs this script -- confirm.

# the two partial scrapes are stacked and only contribute the id and the
# "Operating since" field
own <- read.csv("01_Data/01_Raw/03_Family/bm_scrape_8000_18000.csv")
t1 <- read.csv("01_Data/01_Raw/03_Family/bm_scrape_11643_18000.csv")
own <- smartbind(own, t1)[, c("id", "Operating.since.")]

# the full scrape supplies every other field; its own "Operating since"
# column is removed first so the merge does not create a duplicate of it
temp <- read.csv("01_Data/01_Raw/03_Family/bm_scrape_all.csv")
temp <- temp[, names(temp) != "Operating.since.", drop = FALSE]
own <- merge(own, temp, by = "id", all = TRUE)


## rename variables (positional: depends on the column order after the merge)

names(own)[1:18] <- c(
  "id", "fdate", "comp", "owner", "sector", "phone", "desc", "locprod",
  "pctimp", "imp", "emp", "clitype", "goods",
  "landmark", "pctempha", "nif", "regdate", "impfrom"
)

# keep rows with a usable company name, and columns id..goods plus impfrom
# with the phone number left out
own <- own[
  which(own$comp != "NONE" & own$comp != ""),
  c(
    "id", "fdate", "comp", "owner", "sector", "desc", "locprod",
    "pctimp", "imp", "emp", "clitype", "goods", "impfrom"
  )
]


#####
## clean owner names
#####

## fix a few by hand

own$owner <- gsub("St-LOUIS", "ST-LOUIS", own$owner)
own$owner <- gsub("Bien-Aime", "BIEN-AIME", own$owner)
# subset() also drops rows whose owner is NA
own <- subset(own, own$owner != "Owners: NA")

## delete middle initials
# a single capital letter bracketed by a space, "." or "|" is replaced
# with one space

own$owner <- gsub("[ |.][A-Z]{1,1}[ |.]", " ", own$owner)

## narrow down to owner names
# The field is split on runs of newlines and the SECOND piece is kept.
# Entries that have no second piece become NA instead of aborting the whole
# script with "subscript out of bounds".

owner_lines <- strsplit(own$owner, "\n{1,}")
own$owner <- vapply(
  owner_lines,
  function(parts) if (length(parts) >= 2) parts[[2]] else NA_character_,
  character(1)
)
own$owner <- gsub("^ +", "", own$owner)
# in-place replacement keeps the column character even when every value is NA
own$owner[own$owner %in% "N/A"] <- NA_character_

# take out last names that are all caps
# Each owner field is split on commas; from every piece the first match of
# the pattern below is kept: an upper-case token of at least two letters that
# follows a space, hyphen or "|", optionally preceded by up to two further
# upper-case words. Pieces without a match yield NA.

temp <- strsplit(own$owner, ",")
temp2 <- lapply(temp,
                FUN = function(x) str_extract(string = x, pattern = "([ ]*[A-Z]*[ ]*[A-Z]*[-| ][A-Z]{2,})"))
# number of comma-separated pieces per row (no longer stored in a variable
# called `length`, which shadowed the base function)
n_owner <- lengths(temp2)

# The fam_* block is fixed at 8 columns because later steps refer to
# fam_1..fam_8 explicitly; fail with a readable message if that is too narrow.
if (any(n_owner > 8)) {
  stop("an owner field lists more than 8 names; widen the fam_* matrix",
       call. = FALSE)
}

temp3 <- matrix(NA_character_, length(temp2), 8)
# seq_along()/seq_len() keep the loop correct for zero rows and for rows with
# zero pieces, where 1:0 would index backwards
for (i in seq_along(temp2)) {
  temp3[i, seq_len(n_owner[i])] <- temp2[[i]]
}
colnames(temp3) <- paste0("fam_", 1:8)

# strip leading blanks from every cell at once; `[]` keeps dims and colnames
temp3[] <- gsub("^ +", "", temp3)

# stringsAsFactors = FALSE keeps fam_* as character on R < 4.0 as well, so
# later character assignments into these columns are not turned into NA
own <- cbind.data.frame(own, temp3, stringsAsFactors = FALSE)

# take out last names that are not caps
# Fallback for rows where the all-caps pass found nothing in fam_1: take the
# trailing run of letters (optionally followed by blanks) of every
# comma-separated piece and write the distinct values into fam_1, fam_2, ...

temp4 <- lapply(temp,
                FUN = function(x) str_extract(string = x, pattern = "[A-Za-z]+ *$"))

# position of fam_1 does not change inside the loop, so look it up once
fam_start <- which(colnames(own) == "fam_1")

for (i in seq_along(temp4)) {
  if (is.na(own$fam_1[i])) {
    temp5 <- unique(unlist(temp4[i]))
    # guard: with nothing to write, a `start:(start - 1)` range would count
    # backwards and touch the column in front of fam_1
    if (length(temp5) > 0) {
      own[i, fam_start + seq_along(temp5) - 1] <- temp5
    }
  }
}


## keep the analysis columns, then drop exact duplicate rows

own <- own[, c("comp", "fdate", "emp", "impfrom",
               paste0("fam_", 1:8),
               "sector", "goods")]

# printed as a quick check of how many duplicate rows there are
table(duplicated(own))
own <- own[!duplicated(own), ]


## make long - extend by fam

# Columns 5:12 are fam_1..fam_8. reshape() stacks them into one `fam` column
# and adds its own `time` and `id` bookkeeping columns.
own2 <- reshape(own, direction = "long", varying = 5:12, sep = "_")
own2$fam <- toupper(own2$fam)
# De-duplicate AFTER dropping `time`/`id`. Those two columns make every
# reshaped row unique, so testing duplicated() on the full frame (as before)
# never removed anything.
own2 <- unique(subset(own2,
                      select = c(comp, fam, sector, goods, emp, fdate, impfrom)))

# little more last name cleaning

# expand the abbreviation JN to JEAN (before a space, at end of string,
# before a hyphen)
own2$fam <- gsub("JN ", "JEAN ", own2$fam)
own2$fam <- gsub("JN$", "JEAN", own2$fam)
own2$fam <- gsub("JN-", "JEAN-", own2$fam)
# trailing blanks
own2$fam <- gsub(" *$", "", own2$fam)
# expand ST to SAINT
# NOTE(review): "ST-" is not anchored, so it also rewrites any name that has
# ST directly before a hyphen (e.g. "ERNEST-X" becomes "ERNESAINT X") --
# confirm this cannot occur in the data.
own2$fam <- gsub("ST-", "SAINT ", own2$fam)
own2$fam <- gsub("^ST ", "SAINT ", own2$fam)
# drop a trailing " II", " III", ... suffix
own2$fam <- gsub(" I{2,}$", "", own2$fam)

# remove two known junk values; subset() also drops rows where fam is NA
own2 <- subset(own2, own2$fam != "P ST" & own2$fam != "A")


## fix year in BM
# fdate is split on "/" and the second piece is used as the year
# (presumably the field is month/year -- TODO confirm against the scrape).
# Blank dates get the placeholder "NA/NA", so their year is the STRING "NA",
# as before. Missing (NA) dates and dates without a "/" are now treated the
# same way instead of aborting with "subscript out of bounds".
# NOTE(review): because the year is the string "NA" rather than a real NA,
# later is.na() checks on fyear do not catch it.

own2$fdate <- ifelse(is.na(own2$fdate) | own2$fdate == "", "NA/NA", own2$fdate)
own2$fyear <- vapply(
  strsplit(as.character(own2$fdate), "/", fixed = TRUE),
  function(parts) if (length(parts) >= 2) parts[[2]] else "NA",
  character(1)
)


## merge in final comp names

# normalise BM company names before matching: upper-case, no trailing blanks.
# The previous pattern "+ $" began with a quantifier that had nothing to
# repeat, so it did not express "one or more trailing spaces".
own2$comp <- toupper(own2$comp)
own2$comp <- gsub(" +$", "", own2$comp)

ha <- read.csv("01_Data/02_Clean/ha_comps.csv")

own2$comp_final <- NA

# Column 2 of `ha` supplies the final company name. Each of the 15 columns to
# its right is tried in turn as the lookup key for the BM name; the first
# successful lookup wins.
for (i in 1:15) {
  temp <- subset(ha, select = c(2, 2 + i))
  colnames(temp)[2] <- "con_orig"
  own2 <- merge(own2, temp, by.x = "comp", by.y = "con_orig", all.x = TRUE)
  # merge() appends the looked-up name as the last column; address it by
  # position-from-the-end rather than a hard-coded column number
  new_col <- ncol(own2)
  colnames(own2)[new_col] <- paste("comp_final", i, sep = ".")
  unmatched <- is.na(own2$comp_final)
  # as.character() guards against factor columns being copied as integer codes
  own2$comp_final[unmatched] <- as.character(own2[[new_col]][unmatched])
}

# diagnostics: how many names are still unmatched, and which ones
table(is.na(own2$comp_final))
unique(own2$comp[is.na(own2$comp_final)])
# there are some comps (mostly NGOs) that are not matching properly!

# fall back to the cleaned BM name where no final name was found
still_missing <- is.na(own2$comp_final)
own2$comp_final[still_missing] <- own2$comp[still_missing]

# drop the per-iteration comp_final.N helper columns
own2 <- subset(own2, select = c(comp:comp_final))

famb <- unique(subset(own2, select = c(comp_final, fyear, fam)))


## make dummy for haiti BM
famb$bm <- 1


##########
## ownership data - Orbis
##########

## read in orbis

orb <- read.csv('01_Data/01_Raw/03_Family/orbis_20140222.csv')
# positional rename: assumes the export has exactly these 30 columns in this
# order
colnames(orb) <- c("comp_orig","last_year","revenue","employees","fyear",
                   "dmc_full","dmc_last","db_full","db_last",
                   "bvd_sector","category","trade_desc","bvd_sector_1",
                   "nace_main","nace_primary","nace_primary_desc","nace_secondary","nace_secondary_desc",
                   "sic_primary","sic_primary_desc","sic_secondary","sic_secondary_desc",
                   "naics_core","naics_core_desc",
                   "naics_primary","naics_primary_desc","naics_secondary","naics_secondary_desc",
                   "primary_business","main_products")

## clean up vars
# strip thousands separators before converting to numbers
orb$revenue <- as.numeric(gsub(",", "", orb$revenue))
orb$employees <- as.numeric(gsub(",", "", orb$employees))

# Size bands. The upper bounds are inclusive so that exactly 50 and exactly
# 300 employees land in "10 - 50" and "51 - 300" as the labels say; the
# previous strict "<" tests left both values as NA. A missing head-count
# stays NA.
orb$emp <- ifelse(orb$employees < 10, "less than 10",
                  ifelse(orb$employees <= 50, "10 - 50",
                         ifelse(orb$employees <= 300, "51 - 300",
                                "more than 300")))

## merge in final comp names
# Each of the 15 columns to the right of `consignee` in `ha` is used in turn
# as the lookup key for the Orbis name; every pass attaches the matching
# `consignee` value as comp_final.<i>.
for (i in 1:15) {
  alt_col <- which(colnames(ha) == "consignee") + i
  temp <- data.frame(comp_final = ha[, "consignee"],
                     comp_orig = ha[, alt_col])
  orb <- merge(orb, temp, by = "comp_orig", all.x = TRUE)
  setnames(orb, "comp_final", paste0("comp_final.", i))
}

# collapse the 15 candidates into one column: first non-missing one wins
orb$comp_final <- orb$comp_final.1
for (i in 2:15) {
  unresolved <- is.na(orb$comp_final)
  orb$comp_final[unresolved] <- orb[[paste0("comp_final.", i)]][unresolved]
}

# the per-pass helper columns are no longer needed
orb <- orb[, !(colnames(orb) %in% paste0("comp_final.", 1:15)), drop = FALSE]

## comp info
# one row per final company name: after sorting by name and founding year in
# decreasing order, only the first row of each company is kept
comp <- data.frame(unique(
  orb[, c("comp_final", "last_year", "revenue", "fyear", "emp")]
))
comp <- comp[order(comp$comp_final, comp$fyear, decreasing = TRUE), ]
comp <- comp[!duplicated(comp$comp_final), ]

## families
# stack the two surname columns (dmc_last and db_last) under a single name
temp1 <- data.frame(unique(orb[, c("comp_final", "dmc_last")]))
temp2 <- data.frame(unique(orb[, c("comp_final", "db_last")]))
setnames(temp1, "dmc_last", "bm_last")
setnames(temp2, "db_last", "bm_last")

fam <- smartbind(temp1, temp2)

# Orbis placeholder texts and blanks count as missing
no_info <- c(
  "There is no Directors / managers / contacts information for this company",
  "There is no D&B executives information for this company",
  ""
)
fam$bm_last[fam$bm_last %in% no_info] <- NA

fam <- data.frame(unique(fam))
fam <- fam[!is.na(fam$bm_last), , drop = FALSE]

## make fam upper case
# the upper-cased surname replaces bm_last under the name `fam`
fam$fam <- toupper(fam$bm_last)
fam$bm_last <- NULL

## fix year
# attach the founding year per company: blanks dropped, thousands commas
# removed
temp <- unique(orb[, c("comp_final", "fyear")])
temp <- temp[which(temp$fyear != ""), , drop = FALSE]
temp$fyear <- gsub(",", "", temp$fyear)
fam <- merge(fam, temp, by = "comp_final", all.x = TRUE)

## make dummy for orbis
fam$orb <- 1


#####
## read in some original matches
#####

# hand-made matches: align the key name with the other sources, then flag
# the rows as coming from this file
orig <- read.csv("01_Data/01_Raw/03_Family/own_orig.csv")
setnames(orig, "con_final", "comp_final")
orig$orig <- 1


##########
## merge it together
##########

## merge BM, orbis, orig
# full outer join of Orbis and BM on company + family; the two source-specific
# year columns are dropped here and a single year is attached further down
fam2 <- merge(fam, famb, by = c("comp_final", "fam"), all = TRUE)
fam2 <- fam2[which(fam2$fam != "NA"),
             !(names(fam2) %in% c("fyear.x", "fyear.y")),
             drop = FALSE]
fam2 <- smartbind(fam2, orig)

## make table of years
# one year per company: pool Orbis and BM years, sort within company and keep
# the first row
years <- unique(smartbind(fam[, c("comp_final", "fyear")],
                          famb[, c("comp_final", "fyear")]))
years <- years[!is.na(years$fyear), ]
years <- years[order(years$comp_final, years$fyear), ]
years$dup <- duplicated(years$comp_final)
years <- years[which(!years$dup & years$comp_final != "NA"), ]

## merge in years
# note: the helper column `dup` travels along into fam2
fam2 <- merge(fam2, years, by = "comp_final", all.x = TRUE)

## fix source dummies
# a missing flag means the row did not come from that source
for (flag in c("orb", "bm", "orig")) {
  fam2[[flag]][is.na(fam2[[flag]])] <- 0
}

## delete missing comp names
fam2 <- fam2[which(fam2$comp_final != "NA"), ]


#####
## merge with agemar
#####

## read in agemar
# rows for 2010 (and rows without a year) are excluded
age <- read.csv("01_Data/02_Clean/agemar_ha.csv")
age <- age[which(age$year != 2010), , drop = FALSE]
setnames(age, "con_final", "comp_final")

## drop bottom 10% of shippers
# data.table is needed for the grouped aggregation that follows
age <- data.table(age)
# running share of the total: cumulative sum divided by the NA-ignoring sum
cumsum2 <- function(x) {
  total <- sum(x, na.rm = TRUE)
  cumsum(x) / total
}
# share of the total: each element divided by the NA-ignoring sum
sum2 <- function(x) {
  total <- sum(x, na.rm = TRUE)
  x / total
}
# total weight per product (hs_four) and company
age2 <- age[, list('wgt_kg' = sum(wgt_kg, na.rm = TRUE)),
            by = 'hs_four,comp_final']
# within each product, largest shippers first
age2 <- age2[order(age2$hs_four, -age2$wgt_kg)]
# per product: cumulative share (wgt_cum) and own share (wgt_pct) of weight
age2 <- age2[, list('comp_final' = comp_final,
                    'wgt_cum' = cumsum2(wgt_kg),
                    'wgt_pct' = sum2(wgt_kg)),
             by = 'hs_four']
age2 <- subset(age2, !is.na(age2$hs_four))
# wgt_cum - wgt_pct is the share already covered by larger shippers; a
# company is dropped once that share exceeds 90%
age2$drop <- ifelse(age2$wgt_cum - age2$wgt_pct > 0.9, 1, 0)
table(age2$drop)
age2 <- subset(age2, age2$drop == 0)

## merge fam and age data
# Build the company/product pairs from the FILTERED table. Previously the
# pairs were taken from the unfiltered `age`, so the drop computed above was
# never applied to the output.
# NOTE(review): this changes which rows reach own_clean.csv -- confirm the
# filter was meant to be applied.
age <- unique(subset(age2, select = c(comp_final, hs_four)))
age <- subset(age, age$hs_four != 'NA')

fam3 <- merge(fam2, age, by = 'comp_final')

## numeric founding year plus a 0/1 flag for years before 1992
fam3$fyear <- as.numeric(fam3$fyear)
fam3$pre91 <- as.numeric(fam3$fyear < 1992)

## take out blanks
# rows with an empty (or missing) family name are removed
fam3 <- fam3[which(fam3$fam != ""), , drop = FALSE]

## write to csv

write.csv(fam3, "01_Data/02_Clean/own_clean.csv")




